/**
 */
#include "../1880v2_test_util.h"
#define OUT
#define IN
#include <iostream>
#include <iomanip>
#include <string>
#include <map>
#include <random>
#include <cfloat>
//#include <boost/math/special_functions/next.hpp>
//#define DBG

using namespace std;

//TODO: get from ctx
// Number of channels; the generated lookup tables are duplicated once per channel.
static u32 channel = 32; //<! 1880v2 hardcode
// Height and width of one channel's lookup table.
static u32 table_h = 32;
static u32 table_w = 8;
// Number of entries in one channel's lookup table (table_h * table_w).
static u32 table_hw = table_h * table_w;
// Double-precision copy of the y0 table for a single channel (table_hw entries).
// Written by gen_y0() and read by gen_slope(); allocated during static
// initialization and not released anywhere in the visible code.
static double *lut = (double *)malloc(sizeof(double) * table_hw);



// http://www.enseignement.polytechnique.fr/informatique/INF478/docs/Cpp/en/cpp/types/numeric_limits/epsilon.html
/**
 * Relative floating-point comparison.
 *
 * \x, \y  values to compare (non-integer arithmetic type)
 * \ulp    tolerance expressed in units in the last place
 *
 * Returns true when |x - y| is below the machine epsilon scaled by |x + y|
 * and \ulp, or when |x - y| is smaller than the smallest normal value of T.
 */
template<class T>
typename std::enable_if<!std::numeric_limits<T>::is_integer, bool>::type
    almost_equal(T x, T y, int ulp)
{
    const T distance = std::abs(x-y);

    // epsilon is relative to 1.0, so scale it to the magnitude of the operands
    // and widen it by the requested number of ULPs
    const T tolerance = std::numeric_limits<T>::epsilon() * std::abs(x+y) * ulp;
    if (distance < tolerance) {
        return true;
    }

    // a difference that is subnormal also counts as equal
    return distance < std::numeric_limits<T>::min();
}
/**
 * Selects how input is generated and how the result is compared.
 * "pre_data" means a fixed input pattern is used and the expected output is
 * taken from a fixed golden table instead of being computed.
 */
enum TEST_MODE {
  // fixed input pattern, output must match the golden table bit for bit
  PRE_DATA_COMPARE_FIX = 0,
  // generated input, output compared against atan() within an epsilon
  DATA_COMPARE_ACCURACY = 1,
  // generated input, reference converted bf16->u8 and compared byte for byte
  DATA_COMPARE_U8 = 2,
  // number of modes, not a mode itself
  TEST_MODE_MAX = 3,
};

// Mode of the test currently running; read by tl_lut_ref() and verify().
static TEST_MODE mode = PRE_DATA_COMPARE_FIX;

static u16 test_pattern[] = {
  0x0000,
  0x38D2,
  0x3952,
  0x399D,
  0x39D2,
  0x3A03,
  0x3A1D,
  0x3A38,
  0x3A52,
  0x3A6C,
  0x3A83,
  0x3A90,
  0x3A9D,
  0x3AAA,
  0x3AB8,
  0x3AC5,
  0x3AD2,
  0x3ADF,
  0x3AEC,
  0x3AF9,
  0x3B03,
  0x3B0A,
  0x3B10,
  0x3B17,
  0x3B1D,
  0x3B24,
  0x3B2A,
  0x3B31,
  0x3B38,
  0x3B3E,
  0x3B45,
  0x3B4B,
  0x3B52,
  0x3B58,
  0x3B5F,
  0x3B65,
  0x3B6C,
  0x3B72,
  0x3B79,
  0x3B80,
  0x3B83,
  0x3B86,
  0x3B8A,
  0x3B8D,
  0x3B90,
  0x3B93,
  0x3B97,
  0x3B9A,
  0x3B9D,
  0x3BA1,
  0x3BA4,
  0x3BA7,
  0x3BAA,
  0x3BAE,
  0x3BB1,
  0x3BB4,
  0x3BB8,
  0x3BBB,
  0x3BBE,
  0x3BC1,
  0x3BC5,
  0x3BC8,
  0x3BCB,
  0x3BCE,
  0x3BD2,
  0x3BD5,
  0x3BD8,
  0x3BDC,
  0x3BDF,
  0x3BE2,
  0x3BE5,
  0x3BE9,
  0x3BEC,
  0x3BEF,
  0x3BF2,
  0x3BF6,
  0x3BF9,
  0x3BFC,
  0x3C00,
  0x3C01,
  0x3C03,
  0x3C05,
  0x3C06,
  0x3C08,
  0x3C0A,
  0x3C0B,
  0x3C0D,
  0x3C0F,
  0x3C10,
  0x3C12,
  0x3C13,
  0x3C15,
  0x3C17,
  0x3C18,
  0x3C1A,
  0x3C1C,
  0x3C1D,
  0x3C1F,
  0x3C21,
  0x3C22,
  0x3C24,
  0x3C25,
  0x3C27,
  0x3C29,
  0x3C2A,
  0x3C2C,
  0x3C2E,
  0x3C2F,
  0x3C31,
  0x3C33,
  0x3C34,
  0x3C36,
  0x3C38,
  0x3C39,
  0x3C3B,
  0x3C3C,
  0x3C3E,
  0x3C40,
  0x3C41,
  0x3C43,
  0x3C45,
  0x3C46,
  0x3C48,
  0x3C4A,
  0x3C4B,
  0x3C4D,
  0x3C4E,
  0x3C50,
  0x3C52,
  0x3C53,
  0x3C55,
  0x3C57,
  0x3C58,
  0x3C5A,
  0x3C5C,
  0x3C5D,
  0x3C5F,
  0x3C60,
  0x3C62,
  0x3C64,
  0x3C65,
  0x3C67,
  0x3C69,
  0x3C6A,
  0x3C6C,
  0x3C6E,
  0x3C6F,
  0x3C71,
  0x3C72,
  0x3C74,
  0x3C76,
  0x3C77,
  0x3C79,
  0x3C7B,
  0x3C7C,
  0x3C7E,
  0x3C80,
  0x3C81,
  0x3C81,
  0x3C82,
  0x3C83,
  0x3C84,
  0x3C85,
  0x3C86,
  0x3C86,
  0x3C87,
  0x3C88,
  0x3C89,
  0x3C8A,
  0x3C8A,
  0x3C8B,
  0x3C8C,
  0x3C8D,
  0x3C8E,
  0x3C8F,
  0x3C8F,
  0x3C90,
  0x3C91,
  0x3C92,
  0x3C93,
  0x3C93,
  0x3C94,
  0x3C95,
  0x3C96,
  0x3C97,
  0x3C98,
  0x3C98,
  0x3C99,
  0x3C9A,
  0x3C9B,
  0x3C9C,
  0x3C9C,
  0x3C9D,
  0x3C9E,
  0x3C9F,
  0x3CA0,
  0x3CA1,
  0x3CA1,
  0x3CA2,
  0x3CA3,
  0x3CA4,
  0x3CA5,
  0x3CA5,
  0x3CA6,
  0x3CA7,
  0x3CA8,
  0x3CA9,
  0x3CAA,
  0x3CAA,
  0x3CAB,
  0x3CAC,
  0x3CAD,
  0x3CAE,
  0x3CAE,
  0x3CAF,
  0x3CB0,
  0x3CB1,
  0x3CB2,
  0x3CB3,
  0x3CB3,
  0x3CB4,
  0x3CB5,
  0x3CB6,
  0x3CB7,
  0x3CB8,
  0x3CB8,
  0x3CB9,
  0x3CBA,
  0x3CBB,
  0x3CBC,
  0x3CBC,
  0x3CBD,
  0x3CBE,
  0x3CBF,
  0x3CC0,
  0x3CC1,
  0x3CC1,
  0x3CC2,
  0x3CC3,
  0x3CC4,
  0x3CC5,
  0x3CC5,
  0x3CC6,
  0x3CC7,
  0x3CC8,
  0x3CC9,
  0x3CCA,
  0x3CCA,
  0x3CCB,
  0x3CCC,
  0x3CCD,
  0x3CCE,
  0x3CCE,
  0x3CCF,
  0x3CD0,
  0x3CD1,
  0x3CD2,
  0x3CD3,
  0x3CD3,
  0x3CD4,
  0x3CD5,
  0x3CD6,
  0x3CD7,
  0x3CD7,
  0x3CD8,
  0x3CD9,
  0x3CDA,
  0x3CDB,
  0x3CDC,
  0x3CDC,
  0x3CDD,
  0x3CDE,
  0x3CDF,
  0x3CE0,
  0x3CE0,
  0x3CE1,
  0x3CE2,
  0x3CE3,
  0x3CE4,
  0x3CE5,
  0x3CE5,
  0x3CE6,
  0x3CE7,
  0x3CE8,
  0x3CE9,
  0x3CE9,
  0x3CEA,
  0x3CEB,
  0x3CEC,
  0x3CED,
  0x3CEE,
  0x3CEE,
  0x3CEF,
  0x3CF0,
  0x3CF1,
  0x3CF2,
  0x3CF2,
  0x3CF3,
  0x3CF4,
  0x3CF5,
  0x3CF6,
  0x3CF7,
  0x3CF7,
  0x3CF8,
  0x3CF9,
  0x3CFA,
  0x3CFB,
  0x3CFB,
  0x3CFC,
  0x3CFD,
  0x3CFE,
  0x3CFF,
  0x3D00,
  0x3D00,
  0x3D01,
  0x3D01,
  0x3D01,
  0x3D02,
  0x3D02,
  0x3D03,
  0x3D03,
  0x3D03,
  0x3D04,
  0x3D04,
  0x3D05,
  0x3D05,
  0x3D06,
  0x3D06,
  0x3D06,
  0x3D07,
  0x3D07,
  0x3D08,
  0x3D08,
  0x3D08,
  0x3D09,
  0x3D09,
  0x3D0A,
  0x3D0A,
  0x3D0A,
  0x3D0B,
  0x3D0B,
  0x3D0C,
  0x3D0C,
  0x3D0C,
  0x3D0D,
  0x3D0D,
  0x3D0E,
  0x3D0E,
  0x3D0F,
  0x3D0F,
  0x3D0F,
  0x3D10,
  0x3D10,
  0x3D11,
  0x3D11,
  0x3D11,
  0x3D12,
  0x3D12,
  0x3D13,
  0x3D13,
  0x3D13,
  0x3D14,
  0x3D14,
  0x3D15,
  0x3D15,
  0x3D16,
  0x3D16,
  0x3D16,
  0x3D17,
  0x3D17,
  0x3D18,
  0x3D18,
  0x3D18,
  0x3D19,
  0x3D19,
  0x3D1A,
  0x3D1A,
  0x3D1A,
  0x3D1B,
  0x3D1B,
  0x3D1C,
  0x3D1C,
  0x3D1C,
  0x3D1D,
  0x3D1D,
  0x3D1E,
  0x3D1E,
  0x3D1F,
  0x3D1F,
  0x3D1F,
  0x3D20,
  0x3D20,
  0x3D21,
  0x3D21,
  0x3D21,
  0x3D22,
  0x3D22,
  0x3D23,
  0x3D23,
  0x3D23,
  0x3D24,
  0x3D24,
  0x3D25,
  0x3D25,
  0x3D25,
  0x3D26,
  0x3D26,
  0x3D27,
  0x3D27,
  0x3D28,
  0x3D28,
  0x3D28,
  0x3D29,
  0x3D29,
  0x3D2A,
  0x3D2A,
  0x3D2A,
  0x3D2B,
  0x3D2B,
  0x3D2C,
  0x3D2C,
  0x3D2C,
  0x3D2D,
  0x3D2D,
  0x3D2E,
  0x3D2E,
  0x3D2E,
  0x3D2F,
  0x3D2F,
  0x3D30,
  0x3D30,
  0x3D31,
  0x3D31,
  0x3D31,
  0x3D32,
  0x3D32,
  0x3D33,
  0x3D33,
  0x3D33,
  0x3D34,
  0x3D34,
  0x3D35,
  0x3D35,
  0x3D35,
  0x3D36,
  0x3D36,
  0x3D37,
  0x3D37,
  0x3D38,
  0x3D38,
  0x3D38,
  0x3D39,
  0x3D39,
  0x3D3A,
  0x3D3A,
  0x3D3A,
  0x3D3B,
  0x3D3B,
  0x3D3C,
  0x3D3C,
  0x3D3C,
  0x3D3D,
  0x3D3D,
  0x3D3E,
  0x3D3E,
  0x3D3E,
  0x3D3F,
  0x3D3F,
  0x3D40,
  0x3D40,
  0x3D41,
  0x3D41,
  0x3D41,
  0x3D42,
  0x3D42,
  0x3D43,
  0x3D43,
  0x3D43,
  0x3D44,
  0x3D44,
  0x3D45,
  0x3D45,
  0x3D45,
  0x3D46,
  0x3D46,
  0x3D47,
  0x3D47,
  0x3D47,
  0x3D48,
  0x3D48,
  0x3D49,
  0x3D49,
  0x3D4A,
  0x3D4A,
  0x3D4A,
  0x3D4B,
  0x3D4B,
  0x3D4C,
  0x3D4C,
  0x3D4C,
  0x3D4D,
  0x3D4D,
  0x3D4E,
  0x3D4E,
  0x3D4E,
  0x3D4F,
  0x3D4F,
  0x3D50,
  0x3D50,
  0x3D50,
  0x3D51,
  0x3D51,
  0x3D52,
  0x3D52,
  0x3D53,
  0x3D53,
  0x3D53,
  0x3D54,
  0x3D54,
  0x3D55,
  0x3D55,
  0x3D55,
  0x3D56,
  0x3D56,
  0x3D57,
  0x3D57,
  0x3D57,
  0x3D58,
  0x3D58,
  0x3D59,
  0x3D59,
  0x3D59,
  0x3D5A,
  0x3D5A,
  0x3D5B,
  0x3D5B,
  0x3D5C,
  0x3D5C,
  0x3D5C,
  0x3D5D,
  0x3D5D,
  0x3D5E,
  0x3D5E,
  0x3D5E,
  0x3D5F,
  0x3D5F,
  0x3D60,
  0x3D60,
  0x3D60,
  0x3D61,
  0x3D61,
  0x3D62,
  0x3D62,
  0x3D63,
  0x3D63,
  0x3D63,
  0x3D64,
  0x3D64,
  0x3D65,
  0x3D65,
  0x3D65,
  0x3D66,
  0x3D66,
  0x3D67,
  0x3D67,
  0x3D67,
  0x3D68,
  0x3D68,
  0x3D69,
  0x3D69,
  0x3D69,
  0x3D6A,
  0x3D6A,
  0x3D6B,
  0x3D6B,
  0x3D6C,
  0x3D6C,
  0x3D6C,
  0x3D6D,
  0x3D6D,
  0x3D6E,
  0x3D6E,
  0x3D6E,
  0x3D6F,
  0x3D6F,
  0x3D70,
  0x3D70,
  0x3D70,
  0x3D71,
  0x3D71,
  0x3D72,
  0x3D72,
  0x3D72,
  0x3D73,
  0x3D73,
  0x3D74,
  0x3D74,
  0x3D75,
  0x3D75,
  0x3D75,
  0x3D76,
  0x3D76,
  0x3D77,
  0x3D77,
  0x3D77,
  0x3D78,
  0x3D78,
  0x3D79,
  0x3D79,
  0x3D79,
  0x3D7A,
  0x3D7A,
  0x3D7B,
  0x3D7B,
  0x3D7B,
  0x3D7C,
  0x3D7C,
  0x3D7D,
  0x3D7D,
  0x3D7E,
  0x3D7E,
  0x3D7E,
  0x3D7F,
  0x3D7F,
  0x3D80,
  0x3D80,
  0x3D80,
  0x3D80,
  0x3D81,
  0x3D81,
  0x3D81,
  0x3D81,
  0x3D81,
  0x3D82,
  0x3D82,
  0x3D82,
  0x3D82,
  0x3D82,
  0x3D83,
  0x3D83,
  0x3D83,
  0x3D83,
  0x3D83,
  0x3D84,
  0x3D84,
  0x3D84,
  0x3D84,
  0x3D85,
  0x3D85,
  0x3D85,
  0x3D85,
  0x3D85,
  0x3D86,
  0x3D86,
  0x3D86,
  0x3D86,
  0x3D86,
  0x3D87,
  0x3D87,
  0x3D87,
  0x3D87,
  0x3D87,
  0x3D88,
  0x3D88,
  0x3D88,
  0x3D88,
  0x3D88,
  0x3D89,
  0x3D89,
  0x3D89,
  0x3D89,
  0x3D89,
  0x3D8A,
  0x3D8A,
  0x3D8A,
  0x3D8A,
  0x3D8A,
  0x3D8B,
  0x3D8B,
  0x3D8B,
  0x3D8B,
  0x3D8B,
  0x3D8C,
  0x3D8C,
  0x3D8C,
  0x3D8C,
  0x3D8C,
  0x3D8D,
  0x3D8D,
  0x3D8D,
  0x3D8D,
  0x3D8E,
  0x3D8E,
  0x3D8E,
  0x3D8E,
  0x3D8E,
  0x3D8F,
  0x3D8F,
  0x3D8F,
  0x3D8F,
  0x3D8F,
  0x3D90,
  0x3D90,
  0x3D90,
  0x3D90,
  0x3D90,
  0x3D91,
  0x3D91,
  0x3D91,
  0x3D91,
  0x3D91,
  0x3D92,
  0x3D92,
  0x3D92,
  0x3D92,
  0x3D92,
  0x3D93,
  0x3D93,
  0x3D93,
  0x3D93,
  0x3D93,
  0x3D94,
  0x3D94,
  0x3D94,
  0x3D94,
  0x3D94,
  0x3D95,
  0x3D95,
  0x3D95,
  0x3D95,
  0x3D96,
  0x3D96,
  0x3D96,
  0x3D96,
  0x3D96,
  0x3D97,
  0x3D97,
  0x3D97,
  0x3D97,
  0x3D97,
  0x3D98,
  0x3D98,
  0x3D98,
  0x3D98,
  0x3D98,
  0x3D99,
  0x3D99,
  0x3D99,
  0x3D99,
  0x3D99,
  0x3D9A,
  0x3D9A,
  0x3D9A,
  0x3D9A,
  0x3D9A,
  0x3D9B,
  0x3D9B,
  0x3D9B,
  0x3D9B,
  0x3D9B,
  0x3D9C,
  0x3D9C,
  0x3D9C,
  0x3D9C,
  0x3D9C,
  0x3D9D,
  0x3D9D,
  0x3D9D,
  0x3D9D,
  0x3D9D,
  0x3D9E,
  0x3D9E,
  0x3D9E,
  0x3D9E,
  0x3D9F,
  0x3D9F,
  0x3D9F,
  0x3D9F,
  0x3D9F,
  0x3DA0,
  0x3DA0,
  0x3DA0,
  0x3DA0,
  0x3DA0,
  0x3DA1,
  0x3DA1,
  0x3DA1,
  0x3DA1,
  0x3DA1,
  0x3DA2,
  0x3DA2,
  0x3DA2,
  0x3DA2,
  0x3DA2,
  0x3DA3,
  0x3DA3,
  0x3DA3,
  0x3DA3,
  0x3DA3,
  0x3DA4,
  0x3DA4,
  0x3DA4,
  0x3DA4,
  0x3DA4,
  0x3DA5,
  0x3DA5,
  0x3DA5,
  0x3DA5,
  0x3DA5,
  0x3DA6,
  0x3DA6,
  0x3DA6,
  0x3DA6,
  0x3DA7,
  0x3DA7,
  0x3DA7,
  0x3DA7,
  0x3DA7,
  0x3DA8,
  0x3DA8,
  0x3DA8,
  0x3DA8,
  0x3DA8,
  0x3DA9,
  0x3DA9,
  0x3DA9,
  0x3DA9,
  0x3DA9,
  0x3DAA,
  0x3DAA,
  0x3DAA,
  0x3DAA,
  0x3DAA,
  0x3DAB,
  0x3DAB,
  0x3DAB,
  0x3DAB,
  0x3DAB,
  0x3DAC,
  0x3DAC,
  0x3DAC,
  0x3DAC,
  0x3DAC,
  0x3DAD,
  0x3DAD,
  0x3DAD,
  0x3DAD,
  0x3DAD,
  0x3DAE,
  0x3DAE,
  0x3DAE,
  0x3DAE,
  0x3DAE,
  0x3DAF,
  0x3DAF,
  0x3DAF,
  0x3DAF,
  0x3DB0,
  0x3DB0,
  0x3DB0,
  0x3DB0,
  0x3DB0,
  0x3DB1,
  0x3DB1,
  0x3DB1,
  0x3DB1,
  0x3DB1,
  0x3DB2,
  0x3DB2,
  0x3DB2,
  0x3DB2,
  0x3DB2,
  0x3DB3,
  0x3DB3,
  0x3DB3,
  0x3DB3,
  0x3DB3,
  0x3DB4,
  0x3DB4,
  0x3DB4,
  0x3DB4,
  0x3DB4,
  0x3DB5,
  0x3DB5,
  0x3DB5,
  0x3DB5,
  0x3DB5,
  0x3DB6,
  0x3DB6,
  0x3DB6,
  0x3DB6,
  0x3DB6,
  0x3DB7,
  0x3DB7,
  0x3DB7,
  0x3DB7,
  0x3DB8,
  0x3DB8,
  0x3DB8,
  0x3DB8,
  0x3DB8,
  0x3DB9,
  0x3DB9,
  0x3DB9,
  0x3DB9,
  0x3DB9,
  0x3DBA,
  0x3DBA,
  0x3DBA,
  0x3DBA,
  0x3DBA,
  0x3DBB,
  0x3DBB,
  0x3DBB,
  0x3DBB,
  0x3DBB,
  0x3DBC,
  0x3DBC,
  0x3DBC,
  0x3DBC,
  0x3DBC,
  0x3DBD,
  0x3DBD,
  0x3DBD,
  0x3DBD,
  0x3DBD,
  0x3DBE,
  0x3DBE,
  0x3DBE,
  0x3DBE,
  0x3DBE,
  0x3DBF,
  0x3DBF,
  0x3DBF,
  0x3DBF,
  0x3DBF,
  0x3DC0,
  0x3DC0,
  0x3DC0,
  0x3DC0,
  0x3DC1,
  0x3DC1,
  0x3DC1,
  0x3DC1,
  0x3DC1,
  0x3DC2,
  0x3DC2,
  0x3DC2,
  0x3DC2,
  0x3DC2,
  0x3DC3,
  0x3DC3,
  0x3DC3,
  0x3DC3,
  0x3DC3,
  0x3DC4,
  0x3DC4,
  0x3DC4,
  0x3DC4,
  0x3DC4,
  0x3DC5,
  0x3DC5,
  0x3DC5,
  0x3DC5,
  0x3DC5,
  0x3DC6,
  0x3DC6,
  0x3DC6,
  0x3DC6,
  0x3DC6,
  0x3DC7,
  0x3DC7,
  0x3DC7,
  0x3DC7,
  0x3DC7,
  0x3DC8,
  0x3DC8,
  0x3DC8,
  0x3DC8,
  0x3DC8,
  0x3DC9,
  0x3DC9,
  0x3DC9,
  0x3DC9,
  0x3DCA,
  0x3DCA,
  0x3DCA,
  0x3DCA,
  0x3DCA,
  0x3DCB,
  0x3DCB,
  0x3DCB,
  0x3DCB,
  0x3DCB,
  0x3DCC,
  0x3DCC,
  0x3DCC,
  0x3DCC,
  0x3DCC,
  0x3DCD,
  0x3DCE,
  0x3DCF,
  0x3DD0,
  0x3DD1,
  0x3DD2,
  0x3DD3,
  0x3DD4,
  0x3DD5,
  0x3DD6,
  0x3DD7,
  0x3DD8,
  0x3DD9,
  0x3DDA,
  0x3DDB,
  0x3DDC,
  0x3DDD,
  0x3DDE,
  0x3DDF,
  0x3DE0,
  0x3DE1,
  0x3DE2,
  0x3DE3,
  0x3DE4,
  0x3DE5,
};

static u16 golden_bf16[] = {
  0x0,
  0x38d2,
  0x3952,
  0x399d,
  0x39d2,
  0x3a03,
  0x3a1d,
  0x3a38,
  0x3a52,
  0x3a6c,
  0x3a83,
  0x3a90,
  0x3a9d,
  0x3aaa,
  0x3ab8,
  0x3ac5,
  0x3ad2,
  0x3adf,
  0x3aec,
  0x3af9,
  0x3b03,
  0x3b0a,
  0x3b10,
  0x3b17,
  0x3b1d,
  0x3b24,
  0x3b2a,
  0x3b31,
  0x3b38,
  0x3b3e,
  0x3b45,
  0x3b4b,
  0x3b52,
  0x3b58,
  0x3b5f,
  0x3b65,
  0x3b6c,
  0x3b72,
  0x3b79,
  0x3b80,
  0x3b83,
  0x3b86,
  0x3b8a,
  0x3b8d,
  0x3b90,
  0x3b93,
  0x3b97,
  0x3b9a,
  0x3b9d,
  0x3ba1,
  0x3ba4,
  0x3ba7,
  0x3baa,
  0x3bae,
  0x3bb1,
  0x3bb4,
  0x3bb8,
  0x3bbb,
  0x3bbe,
  0x3bc1,
  0x3bc5,
  0x3bc8,
  0x3bcb,
  0x3bce,
  0x3bd2,
  0x3bd5,
  0x3bd8,
  0x3bdc,
  0x3bdf,
  0x3be2,
  0x3be5,
  0x3be9,
  0x3bec,
  0x3bef,
  0x3bf2,
  0x3bf6,
  0x3bf9,
  0x3bfc,
  0x3c00,
  0x3c01,
  0x3c03,
  0x3c05,
  0x3c06,
  0x3c08,
  0x3c0a,
  0x3c0b,
  0x3c0d,
  0x3c0f,
  0x3c10,
  0x3c12,
  0x3c13,
  0x3c15,
  0x3c17,
  0x3c18,
  0x3c1a,
  0x3c1c,
  0x3c1d,
  0x3c1f,
  0x3c21,
  0x3c22,
  0x3c24,
  0x3c25,
  0x3c27,
  0x3c29,
  0x3c2a,
  0x3c2c,
  0x3c2e,
  0x3c2f,
  0x3c31,
  0x3c33,
  0x3c34,
  0x3c36,
  0x3c38,
  0x3c39,
  0x3c3b,
  0x3c3c,
  0x3c3e,
  0x3c40,
  0x3c41,
  0x3c43,
  0x3c45,
  0x3c46,
  0x3c48,
  0x3c4a,
  0x3c4b,
  0x3c4d,
  0x3c4e,
  0x3c50,
  0x3c52,
  0x3c53,
  0x3c55,
  0x3c57,
  0x3c58,
  0x3c5a,
  0x3c5c,
  0x3c5d,
  0x3c5f,
  0x3c60,
  0x3c62,
  0x3c64,
  0x3c65,
  0x3c67,
  0x3c69,
  0x3c6a,
  0x3c6c,
  0x3c6e,
  0x3c6f,
  0x3c71,
  0x3c72,
  0x3c74,
  0x3c76,
  0x3c77,
  0x3c79,
  0x3c7b,
  0x3c7c,
  0x3c7e,
  0x3c80,
  0x3c81,
  0x3c81,
  0x3c82,
  0x3c83,
  0x3c84,
  0x3c85,
  0x3c86,
  0x3c86,
  0x3c87,
  0x3c88,
  0x3c89,
  0x3c8a,
  0x3c8a,
  0x3c8b,
  0x3c8c,
  0x3c8d,
  0x3c8e,
  0x3c8f,
  0x3c8f,
  0x3c90,
  0x3c91,
  0x3c92,
  0x3c93,
  0x3c93,
  0x3c94,
  0x3c95,
  0x3c96,
  0x3c97,
  0x3c98,
  0x3c98,
  0x3c99,
  0x3c9a,
  0x3c9b,
  0x3c9c,
  0x3c9c,
  0x3c9d,
  0x3c9e,
  0x3c9f,
  0x3ca0,
  0x3ca1,
  0x3ca1,
  0x3ca2,
  0x3ca3,
  0x3ca4,
  0x3ca5,
  0x3ca5,
  0x3ca6,
  0x3ca7,
  0x3ca8,
  0x3ca9,
  0x3caa,
  0x3caa,
  0x3cab,
  0x3cac,
  0x3cad,
  0x3cae,
  0x3cae,
  0x3caf,
  0x3cb0,
  0x3cb1,
  0x3cb2,
  0x3cb3,
  0x3cb3,
  0x3cb4,
  0x3cb5,
  0x3cb6,
  0x3cb7,
  0x3cb8,
  0x3cb8,
  0x3cb9,
  0x3cba,
  0x3cbb,
  0x3cbc,
  0x3cbc,
  0x3cbd,
  0x3cbe,
  0x3cbf,
  0x3cc0,
  0x3cc1,
  0x3cc1,
  0x3cc2,
  0x3cc3,
  0x3cc4,
  0x3cc5,
  0x3cc5,
  0x3cc6,
  0x3cc7,
  0x3cc8,
  0x3cc9,
  0x3cca,
  0x3cca,
  0x3ccb,
  0x3ccc,
  0x3ccd,
  0x3cce,
  0x3cce,
  0x3ccf,
  0x3cd0,
  0x3cd1,
  0x3cd2,
  0x3cd3,
  0x3cd3,
  0x3cd4,
  0x3cd5,
  0x3cd6,
  0x3cd7,
  0x3cd7,
  0x3cd8,
  0x3cd9,
  0x3cda,
  0x3cdb,
  0x3cdc,
  0x3cdc,
  0x3cdd,
  0x3cde,
  0x3cdf,
  0x3ce0,
  0x3ce0,
  0x3ce1,
  0x3ce2,
  0x3ce3,
  0x3ce4,
  0x3ce5,
  0x3ce5,
  0x3ce6,
  0x3ce7,
  0x3ce8,
  0x3ce9,
  0x3ce9,
  0x3cea,
  0x3ceb,
  0x3cec,
  0x3ced,
  0x3cee,
  0x3cee,
  0x3cef,
  0x3cf0,
  0x3cf1,
  0x3cf2,
  0x3cf2,
  0x3cf3,
  0x3cf4,
  0x3cf5,
  0x3cf6,
  0x3cf7,
  0x3cf7,
  0x3cf8,
  0x3cf9,
  0x3cfa,
  0x3cfb,
  0x3cfb,
  0x3cfc,
  0x3cfd,
  0x3cfe,
  0x3cff,
  0x3d00,
  0x3d00,
  0x3d01,
  0x3d01,
  0x3d01,
  0x3d02,
  0x3d02,
  0x3d03,
  0x3d03,
  0x3d03,
  0x3d04,
  0x3d04,
  0x3d05,
  0x3d05,
  0x3d06,
  0x3d06,
  0x3d06,
  0x3d07,
  0x3d07,
  0x3d08,
  0x3d08,
  0x3d08,
  0x3d09,
  0x3d09,
  0x3d0a,
  0x3d0a,
  0x3d0a,
  0x3d0b,
  0x3d0b,
  0x3d0c,
  0x3d0c,
  0x3d0c,
  0x3d0d,
  0x3d0d,
  0x3d0e,
  0x3d0e,
  0x3d0f,
  0x3d0f,
  0x3d0f,
  0x3d10,
  0x3d10,
  0x3d11,
  0x3d11,
  0x3d11,
  0x3d12,
  0x3d12,
  0x3d13,
  0x3d13,
  0x3d13,
  0x3d14,
  0x3d14,
  0x3d15,
  0x3d15,
  0x3d16,
  0x3d16,
  0x3d16,
  0x3d17,
  0x3d17,
  0x3d18,
  0x3d18,
  0x3d18,
  0x3d19,
  0x3d19,
  0x3d1a,
  0x3d1a,
  0x3d1a,
  0x3d1b,
  0x3d1b,
  0x3d1c,
  0x3d1c,
  0x3d1c,
  0x3d1d,
  0x3d1d,
  0x3d1e,
  0x3d1e,
  0x3d1f,
  0x3d1f,
  0x3d1f,
  0x3d20,
  0x3d20,
  0x3d21,
  0x3d21,
  0x3d21,
  0x3d22,
  0x3d22,
  0x3d23,
  0x3d23,
  0x3d23,
  0x3d24,
  0x3d24,
  0x3d25,
  0x3d25,
  0x3d25,
  0x3d26,
  0x3d26,
  0x3d27,
  0x3d27,
  0x3d28,
  0x3d28,
  0x3d28,
  0x3d29,
  0x3d29,
  0x3d2a,
  0x3d2a,
  0x3d2a,
  0x3d2b,
  0x3d2b,
  0x3d2c,
  0x3d2c,
  0x3d2c,
  0x3d2d,
  0x3d2d,
  0x3d2e,
  0x3d2e,
  0x3d2e,
  0x3d2f,
  0x3d2f,
  0x3d30,
  0x3d30,
  0x3d31,
  0x3d31,
  0x3d31,
  0x3d32,
  0x3d32,
  0x3d33,
  0x3d33,
  0x3d33,
  0x3d34,
  0x3d34,
  0x3d35,
  0x3d35,
  0x3d35,
  0x3d36,
  0x3d36,
  0x3d37,
  0x3d37,
  0x3d38,
  0x3d38,
  0x3d38,
  0x3d39,
  0x3d39,
  0x3d3a,
  0x3d3a,
  0x3d3a,
  0x3d3b,
  0x3d3b,
  0x3d3c,
  0x3d3c,
  0x3d3c,
  0x3d3d,
  0x3d3d,
  0x3d3e,
  0x3d3e,
  0x3d3e,
  0x3d3f,
  0x3d3f,
  0x3d40,
  0x3d40,
  0x3d41,
  0x3d41,
  0x3d41,
  0x3d42,
  0x3d42,
  0x3d43,
  0x3d43,
  0x3d43,
  0x3d44,
  0x3d44,
  0x3d45,
  0x3d45,
  0x3d45,
  0x3d46,
  0x3d46,
  0x3d47,
  0x3d47,
  0x3d47,
  0x3d48,
  0x3d48,
  0x3d49,
  0x3d49,
  0x3d4a,
  0x3d4a,
  0x3d4a,
  0x3d4b,
  0x3d4b,
  0x3d4c,
  0x3d4c,
  0x3d4c,
  0x3d4d,
  0x3d4d,
  0x3d4e,
  0x3d4e,
  0x3d4e,
  0x3d4f,
  0x3d4f,
  0x3d50,
  0x3d50,
  0x3d50,
  0x3d51,
  0x3d51,
  0x3d52,
  0x3d52,
  0x3d53,
  0x3d53,
  0x3d53,
  0x3d54,
  0x3d54,
  0x3d55,
  0x3d55,
  0x3d55,
  0x3d56,
  0x3d56,
  0x3d57,
  0x3d57,
  0x3d57,
  0x3d58,
  0x3d58,
  0x3d59,
  0x3d59,
  0x3d59,
  0x3d5a,
  0x3d5a,
  0x3d5b,
  0x3d5b,
  0x3d5c,
  0x3d5c,
  0x3d5c,
  0x3d5d,
  0x3d5d,
  0x3d5e,
  0x3d5e,
  0x3d5e,
  0x3d5f,
  0x3d5f,
  0x3d60,
  0x3d60,
  0x3d60,
  0x3d61,
  0x3d61,
  0x3d62,
  0x3d62,
  0x3d63,
  0x3d63,
  0x3d63,
  0x3d64,
  0x3d64,
  0x3d65,
  0x3d65,
  0x3d65,
  0x3d66,
  0x3d66,
  0x3d67,
  0x3d67,
  0x3d67,
  0x3d68,
  0x3d68,
  0x3d69,
  0x3d69,
  0x3d69,
  0x3d6a,
  0x3d6a,
  0x3d6b,
  0x3d6b,
  0x3d6c,
  0x3d6c,
  0x3d6c,
  0x3d6d,
  0x3d6d,
  0x3d6e,
  0x3d6e,
  0x3d6e,
  0x3d6f,
  0x3d6f,
  0x3d70,
  0x3d70,
  0x3d70,
  0x3d71,
  0x3d71,
  0x3d72,
  0x3d72,
  0x3d72,
  0x3d73,
  0x3d73,
  0x3d74,
  0x3d74,
  0x3d75,
  0x3d75,
  0x3d75,
  0x3d76,
  0x3d76,
  0x3d77,
  0x3d77,
  0x3d77,
  0x3d78,
  0x3d78,
  0x3d79,
  0x3d79,
  0x3d79,
  0x3d7a,
  0x3d7a,
  0x3d7b,
  0x3d7b,
  0x3d7b,
  0x3d7c,
  0x3d7c,
  0x3d7d,
  0x3d7d,
  0x3d7e,
  0x3d7e,
  0x3d7e,
  0x3d7f,
  0x3d7f,
  0x3d80,
  0x3d80,
  0x3d80,
  0x3d80,
  0x3d81,
  0x3d81,
  0x3d81,
  0x3d81,
  0x3d81,
  0x3d82,
  0x3d82,
  0x3d82,
  0x3d82,
  0x3d82,
  0x3d83,
  0x3d83,
  0x3d83,
  0x3d83,
  0x3d83,
  0x3d84,
  0x3d84,
  0x3d84,
  0x3d84,
  0x3d85,
  0x3d85,
  0x3d85,
  0x3d85,
  0x3d85,
  0x3d86,
  0x3d86,
  0x3d86,
  0x3d86,
  0x3d86,
  0x3d87,
  0x3d87,
  0x3d87,
  0x3d87,
  0x3d87,
  0x3d88,
  0x3d88,
  0x3d88,
  0x3d88,
  0x3d88,
  0x3d89,
  0x3d89,
  0x3d89,
  0x3d89,
  0x3d89,
  0x3d8a,
  0x3d8a,
  0x3d8a,
  0x3d8a,
  0x3d8a,
  0x3d8b,
  0x3d8b,
  0x3d8b,
  0x3d8b,
  0x3d8b,
  0x3d8c,
  0x3d8c,
  0x3d8c,
  0x3d8c,
  0x3d8c,
  0x3d8d,
  0x3d8d,
  0x3d8d,
  0x3d8d,
  0x3d8e,
  0x3d8e,
  0x3d8e,
  0x3d8e,
  0x3d8e,
  0x3d8f,
  0x3d8f,
  0x3d8f,
  0x3d8f,
  0x3d8f,
  0x3d90,
  0x3d90,
  0x3d90,
  0x3d90,
  0x3d90,
  0x3d91,
  0x3d91,
  0x3d91,
  0x3d91,
  0x3d91,
  0x3d92,
  0x3d92,
  0x3d92,
  0x3d92,
  0x3d92,
  0x3d93,
  0x3d93,
  0x3d93,
  0x3d93,
  0x3d93,
  0x3d94,
  0x3d94,
  0x3d94,
  0x3d94,
  0x3d94,
  0x3d95,
  0x3d95,
  0x3d95,
  0x3d95,
  0x3d96,
  0x3d96,
  0x3d96,
  0x3d96,
  0x3d96,
  0x3d97,
  0x3d97,
  0x3d97,
  0x3d97,
  0x3d97,
  0x3d98,
  0x3d98,
  0x3d98,
  0x3d98,
  0x3d98,
  0x3d99,
  0x3d99,
  0x3d99,
  0x3d99,
  0x3d99,
  0x3d9a,
  0x3d9a,
  0x3d9a,
  0x3d9a,
  0x3d9a,
  0x3d9b,
  0x3d9b,
  0x3d9b,
  0x3d9b,
  0x3d9b,
  0x3d9c,
  0x3d9c,
  0x3d9c,
  0x3d9c,
  0x3d9c,
  0x3d9d,
  0x3d9d,
  0x3d9d,
  0x3d9d,
  0x3d9d,
  0x3d9e,
  0x3d9e,
  0x3d9e,
  0x3d9e,
  0x3d9f,
  0x3d9f,
  0x3d9f,
  0x3d9f,
  0x3d9f,
  0x3da0,
  0x3da0,
  0x3da0,
  0x3da0,
  0x3da0,
  0x3da1,
  0x3da1,
  0x3da1,
  0x3da1,
  0x3da1,
  0x3da2,
  0x3da2,
  0x3da2,
  0x3da2,
  0x3da2,
  0x3da3,
  0x3da3,
  0x3da3,
  0x3da3,
  0x3da3,
  0x3da4,
  0x3da4,
  0x3da4,
  0x3da4,
  0x3da4,
  0x3da5,
  0x3da5,
  0x3da5,
  0x3da5,
  0x3da5,
  0x3da6,
  0x3da6,
  0x3da6,
  0x3da6,
  0x3da7,
  0x3da7,
  0x3da7,
  0x3da7,
  0x3da7,
  0x3da8,
  0x3da8,
  0x3da8,
  0x3da8,
  0x3da8,
  0x3da9,
  0x3da9,
  0x3da9,
  0x3da9,
  0x3da9,
  0x3daa,
  0x3daa,
  0x3daa,
  0x3daa,
  0x3daa,
  0x3dab,
  0x3dab,
  0x3dab,
  0x3dab,
  0x3dab,
  0x3dac,
  0x3dac,
  0x3dac,
  0x3dac,
  0x3dac,
  0x3dad,
  0x3dad,
  0x3dad,
  0x3dad,
  0x3dad,
  0x3dae,
  0x3dae,
  0x3dae,
  0x3dae,
  0x3dae,
  0x3daf,
  0x3daf,
  0x3daf,
  0x3daf,
  0x3db0,
  0x3db0,
  0x3db0,
  0x3db0,
  0x3db0,
  0x3db1,
  0x3db1,
  0x3db1,
  0x3db1,
  0x3db1,
  0x3db2,
  0x3db2,
  0x3db2,
  0x3db2,
  0x3db2,
  0x3db3,
  0x3db3,
  0x3db3,
  0x3db3,
  0x3db3,
  0x3db4,
  0x3db4,
  0x3db4,
  0x3db4,
  0x3db4,
  0x3db5,
  0x3db5,
  0x3db5,
  0x3db5,
  0x3db5,
  0x3db6,
  0x3db6,
  0x3db6,
  0x3db6,
  0x3db6,
  0x3db7,
  0x3db7,
  0x3db7,
  0x3db7,
  0x3db8,
  0x3db8,
  0x3db8,
  0x3db8,
  0x3db8,
  0x3db9,
  0x3db9,
  0x3db9,
  0x3db9,
  0x3db9,
  0x3dba,
  0x3dba,
  0x3dba,
  0x3dba,
  0x3dba,
  0x3dbb,
  0x3dbb,
  0x3dbb,
  0x3dbb,
  0x3dbb,
  0x3dbc,
  0x3dbc,
  0x3dbc,
  0x3dbc,
  0x3dbc,
  0x3dbd,
  0x3dbd,
  0x3dbd,
  0x3dbd,
  0x3dbd,
  0x3dbe,
  0x3dbe,
  0x3dbe,
  0x3dbe,
  0x3dbe,
  0x3dbf,
  0x3dbf,
  0x3dbf,
  0x3dbf,
  0x3dbf,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc0,
  0x3dc1,
  0x3dc1,
  0x3dc1,
  0x3dc1,
  0x3dc1,
  0x3dc2,
  0x3dc2,
  0x3dc2,
  0x3dc2,
  0x3dc2,
  0x3dc3,
  0x3dc3,
  0x3dc3,
  0x3dc3,
  0x3dc3,
  0x3dc4,
  0x3dc4,
  0x3dc4,
  0x3dc4,
  0x3dc4,
  0x3dc5,
  0x3dc5,
  0x3dc5,
  0x3dc5,
  0x3dc5,
  0x3dc6,
  0x3dc6,
  0x3dc6,
  0x3dc6,
  0x3dc6,
  0x3dc7,
  0x3dc7,
  0x3dc7,
  0x3dc7,
  0x3dc7,
  0x3dc8,
  0x3dc8,
  0x3dc8,
  0x3dc8,
  0x3dc9,
  0x3dc9,
  0x3dc9,
  0x3dc9,
  0x3dc9,
  0x3dca,
  0x3dca,
  0x3dca,
  0x3dca,
  0x3dca,
  0x3dcb,
  0x3dcb,
  0x3dcb,
  0x3dcb,
  0x3dcb,
  0x3dcc,
  0x3dcd,
  0x3dce,
  0x3dcf,
  0x3dd0,
  0x3dd1,
  0x3dd2,
  0x3dd3,
  0x3dd4,
  0x3dd5,
  0x3dd6,
  0x3dd7,
  0x3dd8,
  0x3dd9,
  0x3dda,
  0x3ddb,
  0x3ddc,
  0x3ddd,
  0x3dde,
  0x3ddf,
  0x3de0,
  0x3de1,
  0x3de2,
  0x3de3,
  0x3de4,
};

// Reference function the lookup tables approximate: f(x) = atan(x).
// \i input value; returns the arc tangent widened to double.
static double _gen_atan(float i) {
  const double angle = atan(i);
  return angle;
}

/**
 * Build the reference output the hardware result is compared against.
 *
 * \ofmap        receives one reference value per input element
 * \ifmap        bf16 input values
 * \ifmap_shape  shape of \ifmap; its element count bounds the loop
 *
 * Depending on the global `mode` the reference is atan(input) as bf16, the
 * matching entry of golden_bf16, or atan(input) converted through
 * convert_bf16_s8() and truncated to u8.
 */
static void tl_lut_ref(
    u16 *ofmap,
    u16 *ifmap,
    tl_shape_t ifmap_shape
    )
{
  assert(ofmap);

#if 0
  #define INFP32FILE "infp32file.bin"
  #define OUTBF16FILE "lutbf16out.bin"
  FILE* pFile;
  pFile = fopen(INFP32FILE, "wb");
  int shape_sz = tl_shape_size(&ifmap_shape);
  float *f = (float *)malloc(sizeof(float) * shape_sz);
  for (int i = 0; i < shape_sz; i++) {
    f[i] = convert_bf16_fp32(ifmap[i]);
  }
  fwrite(f, 1, shape_sz *sizeof(float), pFile);
  fclose(pFile);

  // 2. read result from `eval_lut.py`
  char command[256]; // 7 means atan
  sprintf(command, "python eval_lut.py --func_id 7 --lut_input_range_start %d --lut_input_range_end %d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n",
      range_start, range_end,
      INFP32FILE, OUTBF16FILE);

  int r;
  r = system(command);
  printf ("command is %s, return %d\n", command, r);

  pFile = fopen(OUTBF16FILE, "rb");
  if (!pFile) {
    fprintf(stderr, "open golden %s fail\n", OUTBF16FILE);
    exit(-1);
  }

  size_t file_length;
  file_length = fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile);
  printf("read from golden, file size %" PRIu64 "\n", file_length);
  fclose(pFile);
#else
  for (u32 pos = 0; pos < tl_shape_size(&ifmap_shape); pos++) {
    // software reference: atan of the input, rounded back to bf16
    const float x = convert_bf16_fp32(ifmap[pos]);
    const double y = _gen_atan(x);
    u16 expected = convert_fp32_bf16(y);

    if (mode == PRE_DATA_COMPARE_FIX) {
      // fixed-pattern mode ignores the computed value and uses the golden table
      expected = golden_bf16[pos];
    }
    else if (mode == DATA_COMPARE_U8) {
      // u8 mode compares against the integer conversion of the bf16 value
      expected = (u8) convert_bf16_s8(expected);
    }

    ofmap[pos] = expected;
  }
#endif
}

/**
 * Fill the base-value (y0) lookup table with atan samples.
 *
 * \table_data_y0  output, \table_size bf16 entries (one table per channel)
 * \table_size     total number of entries over all channels
 * \range_start, \range_end  input range covered by the table
 *
 * Channel 0 is computed and then copied to the remaining channels. Every
 * sample is also stored in double precision in the global `lut`.
 */
static void gen_y0(u16 *table_data_y0, u64 table_size,
    int range_start, int range_end) {

  // table entries per unit of input
  const float scale = table_hw / (1.0 * abs(range_start - range_end));
  //<! 32*8 table, duplicate `channel` times;
  const int half = table_size / channel / 2;

  assert(table_size);
  assert(half == 128);

  // keep the double-precision sample and its bf16 rounding side by side
  auto put = [&](u64 slot, double value) {
    lut[slot] = value;
    table_data_y0[slot] = convert_fp32_bf16(value);
  };

  u64 idx = 0;
  double s;

  // channel 0, first half: index [0, 127] maps to non-negative inputs
  for (int step = 0; step < half; step++) {
    const float x = idx / scale;
    s = _gen_atan(x);
    put(idx, s);
#ifdef DBG
    printf("t [%" PRIu64 "] is %f[%d], 0x%x fp is %f d is %.8lf, input is %f\n", idx, convert_bf16_fp32(table_data_y0[idx]), step, table_data_y0[idx], (float)s, s, x);
#endif
    idx++;
  }

  // index 128 is x = -128, i.e. the start of the range
  s = _gen_atan(range_start);
  put(idx, s);
#ifdef DBG
  printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data_y0[idx]), 0, table_data_y0[idx]);
#endif
  idx++;

  // remaining negative inputs x [-127, -1], stored in 2's complement index order
  for (int step = 1; step < half; step++) {
    const float offset = (step) / scale;
    s = _gen_atan(range_start + offset);
    put(idx, s);
#ifdef DBG
    printf("t [%" PRIu64 "] is %f[%d], 0x%x fp is %f d is %.8lf input is %f\n", idx, convert_bf16_fp32(table_data_y0[idx]), -127 + step, table_data_y0[idx], (float)s, s, range_start + offset);
#endif
    idx++;
  }

  // duplicate channel #0 into channel #1 to #31
  //TODO: tensor copy
  for (u32 c = 1; c < channel; c++) {
    u16 *channel_table = &table_data_y0[c * table_hw];
    memcpy(channel_table, &table_data_y0[0], sizeof(u16) * table_hw);
  }
}

/**
 * Fill the slope lookup table used for linear interpolation between y0 entries.
 *
 * \table_data_y0  y0 table (only checked for non-NULL; the samples are read
 *                 from the double-precision global `lut` filled by gen_y0())
 * \table_slope    output, \table_size bf16 entries (one table per channel)
 * \table_size     total number of entries over all channels
 * \range_start, \range_end  input range covered by the table
 *
 * Entry i holds the difference between sample i and its neighbour: the next
 * sample for the first half of the table, the previous one for the second
 * half, which is laid out in 2's complement index order.
 */
static void gen_slope(u16 IN *table_data_y0, u16* OUT table_slope, u64 table_size,
    int range_start, int range_end) {

  float scale = table_hw / (1.0 * abs(range_start - range_end));
  u32 half = table_size / channel / 2;
  assert(half == 128);
  assert(table_data_y0);

  for (u32 i = 0; i < table_hw; i++) {
    double x0 = lut[i];
    double x1;
    double delta = 1.0;
    if (i == half - 1) {
      //<! slope[127] means f(127)~f(128)
      x1 = _gen_atan(range_end);
    }
    else if (i == half) {
      // 128 index mean x1 is -129 and x0 is -128
      x1 = _gen_atan(range_start - 1/scale);
      delta = -1.0;
    }
    else if (i > half) {
      // second half walks towards more negative inputs
      x1 = lut[i-1];
      delta = -1.0;
    }
    else {
      // the neighbour is loaded only here: reading lut[i+1] unconditionally
      // would touch lut[table_hw], one past the allocation, on the last entry
      x1 = lut[i+1];
    }
    double s = (x1 - x0) / delta; // x1 already scale up
    table_slope[i] = convert_fp32_bf16((float)s);
#ifdef DBG
    printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n",
        i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0);
#endif
  }

#if 0 //def DBG
  for (u32 i = 0; i < 2 * half; i++) {
	printf("slope [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_slope[i]),
		table_slope[i]);
  }
#endif /* ifdef DBG */

  // duplicate channel #1 to #31
  //TODO: tensor copy
  for (u64 i = 1; i < channel; i++) {
    memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw);
  }
}

/**
 * Compare the device output with the reference, element by element.
 *
 * \ofmap_data  device output; in DATA_COMPARE_U8 mode it is read as packed
 *              bytes, two per u16 word with the even index in the low byte
 * \ref_data    reference values, one u16 per element
 * \ifmap       input values, used only for the failure message
 * \ifmap_size  number of elements to compare
 * \epsilon     absolute tolerance used in DATA_COMPARE_ACCURACY mode
 *
 * Returns true when every element matches. On the first mismatch a
 * diagnostic is printed to stderr and the process exits with -1.
 */
static bool verify(u16 *ofmap_data, u16 *ref_data, u16* ifmap, u64 ifmap_size, float epsilon) {
  u64 size = ifmap_size;

  for (u64 i = 0; i < size; i++) {
    bool is_close;
    u16 ref = ref_data[i];
    u16 ofmap_data_bf16;
    float ref_f;
    float ofmap_data_f;

    if (mode == DATA_COMPARE_U8) {
      // isolate the byte that belongs to element i so that the comparison and
      // the diagnostic below both report the same value
      u32 shift = (i%2)*8;
      u8 got_u8 = (u8)(ofmap_data[i/2] >> shift);
      ofmap_data_bf16 = got_u8;
      ofmap_data_f = (float)got_u8;
      ref_f = (float)(ref);

      is_close = got_u8 == (u8)ref;

      //printf("[%" PRIu64 "] of is %x ref is %x\n", i, got_u8, (u8)ref);
    }
    else {
      ref_f = convert_bf16_fp32(ref);
      ofmap_data_f = convert_bf16_fp32(ofmap_data[i]);
      ofmap_data_bf16 = ofmap_data[i];

      if (mode == PRE_DATA_COMPARE_FIX) {
        // fixed pattern: require a bit-exact match
        is_close = ofmap_data[i] == ref;
      }
      else {
        // accuracy mode: absolute error must stay below epsilon
        is_close = fabs(ref_f-ofmap_data_f) < epsilon;
      }
    }

#if 0
    if (i == 0) {
      fprintf(stderr,
          "input, ofmap, ref, diff, diff / ref_f\n");
    }

    fprintf(stderr,
        "%.16f, %f, %lf, %lf, %lf\n",
        convert_bf16_fp32(ifmap[i]),
        ofmap_data_f, ref_f, fabs(ref_f - ofmap_data_f), fabs(ref_f - ofmap_data_f) / ref_f);
    //if (ofmap_data[i] != ref && fabs(ref_f-ofmap_data_f) > 0.07) 
    //if (ofmap_data[i] != ref && AlmostEqual2sComplement(ref_f, ofmap_data_f, 1))
    //if (ofmap_data[i] != ref && AlmostEqual(ref_f, ofmap_data_f, FLT_EPSILON))
#endif
    if (!is_close) {
      float input = convert_bf16_fp32(ifmap[i]);
      fprintf(stderr,
          "comparing failed at ofmap_data[%" PRIu64 "](input:%f)\n"
          "\tgot %x, exp %x, fp32: got %f exp %f, atan(%f) = %f\n",
          i, input,
          ofmap_data_bf16, ref, ofmap_data_f, ref_f,
          input, _gen_atan(input));
      exit(-1);
    }
  }

  return true;
}

/*
 * Emit the TIU/TDMA commands that evaluate atan on \tl_ifmap by table lookup
 * plus linear interpolation:
 *   result = y0[idx] + (x * scale - idx) * slope[idx]
 * where idx is the scaled input converted to int8.
 *
 * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
 *
 * \ctx, \bmk runtime handle and kernel context used to issue and submit commands
 * \tl_ifmap input tensor; it is overwritten (scaled, then reduced to x - idx)
 * \tl_buf tmp buffer, the shape MUST be same type/shape with \tl_ifmap
 * \tl_y0_buf tmp buffer for lut used, shape should be <1,32,32,8>
 * \tl_slope_buf tmp buffer for lut used, shape should be <1,32,32,8>
 * \tl_ofmap_bf16 result as bf16, MUST be given because it is also used as tmp buffer
 * \tl_ofmap_u8 result as u8 type, NULL means use bf16 result
 * \range_start, \range_end specify data range, default range is -8 ~ +8
 *
 * Always returns 0.
 */
static int bf16_emit(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk,
	tl_t* tl_ifmap,
	tl_t* tl_buf,
    tl_t* tl_y0_buf,
    tl_t* tl_slope_buf,
	tl_t* OUT tl_ofmap_bf16,
	tl_t* OUT tl_ofmap_u8,
    int range_start, int range_end
  ) {

  // both lookup tables must have the same shape
  assert(tl_y0_buf->shape.n == tl_slope_buf->shape.n);
  assert(tl_y0_buf->shape.c == tl_slope_buf->shape.c);
  assert(tl_y0_buf->shape.h == tl_slope_buf->shape.h);
  assert(tl_y0_buf->shape.w == tl_slope_buf->shape.w);

  // shape used for the int8 index view: h*w of the output flattened into h
  tl_shape_t tl_shape_int8 = {1, channel, tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1};

  fmt_t fmt = FMT_BF16;

  int data_type_size = bytesize_of_fmt(fmt);
  u64 table_size = tl_shape_size(&tl_y0_buf->shape);
  u64 table_bytesize  =  table_size * data_type_size;

  // host-side tables; released at the end of this function
  u16 *table_data_y0 = (u16 *)xmalloc(table_bytesize);
  gen_y0 (table_data_y0, table_size, range_start, range_end);

  // gen_slope() reads the samples gen_y0() stored, so it must run second
  u16 *table_data_slope = (u16 *)xmalloc(table_bytesize);
  gen_slope(table_data_y0, table_data_slope, table_size, range_start, range_end);

  // table entries per unit of input
  float scale = table_hw / (1.0 * abs(range_start - range_end));

  // prepare load data from sys->local
  bmk1880v2_tdma_tg2l_tensor_copy_param_t copy_p2, copy_p3;
  memset(&copy_p2, 0, sizeof(copy_p2));
  memset(&copy_p3, 0, sizeof(copy_p3));

  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_y0_buf, table_data_y0, fmt, &copy_p2);
  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_slope_buf, table_data_slope, fmt, &copy_p3);

  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p2.src, &copy_p2); // y0 (base value) table
  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p3.src, &copy_p3); // slope table

  bmk1880v2_tdma_l2l_tensor_copy_param_t p10;
  memset(&p10, 0, sizeof(p10));

  // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap
  bmk1880v2_tiu_element_wise_mul_param_t p1;
  memset(&p1, 0, sizeof(p1));
  p1.res_high = NULL;
  p1.res_low = tl_ifmap;
  p1.a = tl_ifmap;
  p1.b_is_const = 1;
  p1.b_const.val = convert_fp32_bf16(scale);
  p1.rshift_bits = 0;
  p1.relu_enable = 0;
  bmk1880v2_tiu_element_wise_mul(bmk, &p1);

  // <! get idx from bf16->int8
  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
  // dst is a copy of the tl_ofmap_bf16 descriptor, re-described as int8
  bmk1880v2_tensor_lmem_t dst;
  memcpy(&dst, tl_ofmap_bf16, sizeof(bmk1880v2_tensor_lmem_t)); 
  dst.fmt = FMT_I8;
  dst.shape = tl_shape_int8;
  dst.stride = bmk1880v2_tensor_lmem_default_stride(bmk, dst.shape, dst.fmt, /*eu_align*/ 1);
  // NOTE(review): presumably doubled so each int8 index lands at the spacing of a bf16 element - confirm
  dst.stride.h = dst.stride.h * 2;
  // NOTE(review): meaning of rounding mode 1 is not visible here - confirm against the bmk1880v2 API
  dst.int8_rnd_mode = 1;
  p10.dst = &dst;
  p10.src = tl_ifmap;
  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
  test_submit(ctx);
  dst.int8_rnd_mode = 0; // reset

  // <! convert the int8 index back to bf16 for the subtraction below, both operands MUST be in the same format
  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
  p10.dst = tl_buf; //<! bf16
  p10.src = &dst;
  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
  test_submit(ctx);

  // <! sub, diff base , a - b
  // (x - x0), stored back into tl_ifmap
  bmk1880v2_tiu_element_wise_sub_param_t p5;
  memset(&p5, 0, sizeof(p5));
  p5.res_high = 0;
  p5.res_low = tl_ifmap;
  p5.a_high = 0;
  p5.a_low = tl_ifmap;
  p5.b_high = 0;
  p5.b_low = tl_buf;
  p5.rshift_bits = 0;
  bmk1880v2_tiu_element_wise_sub(bmk, &p5);

  // get f(x0) and slope(x)
  // reshape, 16->16: describe dst with the bf16 format, shape and stride of tl_buf
  dst.fmt = fmt;
  dst.shape = tl_buf->shape;
  dst.stride = tl_buf->stride;

  // <! get slope by index
  // <! ( (f(x1) - f(x0)) / (x1 - x0) )
  // <! TIU MUST with same shape and stride, we leverage output map shape and stride
  bmk1880v2_tiu_lookup_table_param_t p12;
  memset(&p12, 0x0, sizeof(p12));
  p12.ofmap = tl_buf;
  p12.ifmap = &dst;
  p12.table = tl_slope_buf;
  bmk1880v2_tiu_lookup_table(bmk, &p12);

  // base f(x0), written to tl_ofmap_bf16
  memset(&p12, 0x0, sizeof(bmk1880v2_tiu_lookup_table_param_t));
  p12.ofmap = tl_ofmap_bf16;
  p12.ifmap = &dst;
  p12.table = tl_y0_buf;
  bmk1880v2_tiu_lookup_table(bmk, &p12);

  // <! mac
  // <! part A + part B, a * b + res = res
  // i.e. tl_ofmap_bf16 = (x - x0) * slope + f(x0)
  bmk1880v2_tiu_element_wise_mac_param_t p2;
  memset(&p2, 0, sizeof(p2));
  p2.res_high = 0;
  p2.res_low = tl_ofmap_bf16;
  p2.res_is_int8 = 0;
  p2.a = tl_ifmap;
  p2.b_is_const = 0;
  p2.b = tl_buf;
  p2.lshift_bits = 0;
  p2.rshift_bits = 0;
  p2.relu_enable = 0;
  bmk1880v2_tiu_element_wise_mac(bmk, &p2);

  // optional extra copy of the bf16 result into the u8 output tensor
  if (tl_ofmap_u8) {
	p10.dst = tl_ofmap_u8;
	p10.src = tl_ofmap_bf16;
	bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
  }

  test_submit(ctx);

  free(table_data_y0);
  free(table_data_slope);

  return 0;
}

/**
 * Fill \p input_data with bf16 test inputs.
 *
 * - PRE_DATA_COMPARE_FIX: copy the fixed \c test_pattern table. The copy is
 *   clamped to the destination size so it can never write past
 *   \p input_data, and any destination elements not covered by the pattern
 *   are zero-filled so no uninitialized values are fed to the test.
 * - every other mode: generate a deterministic sequence. Element i is
 *     (i % (range_end - 2)) * (+1 for odd i, -1 for even i)
 *       + 0.03 + (i % table_hw) * 0.002
 *   converted to bf16 with convert_fp32_bf16().
 *
 * \param input_data  destination buffer holding \p ifmap_size u16 elements
 * \param ifmap_size  number of elements to produce
 * \param mode        selects the fixed pattern or the generated sequence
 * \param range_start not used by the current generator; kept so callers
 *                    do not need to change
 * \param range_end   upper bound hint; (range_end - 2) is the modulus of the
 *                    integer part of each generated value
 */
static void gen_input(u16 *input_data, u64 ifmap_size, TEST_MODE mode, 
    int range_start, int range_end) {

  (void)range_start; // the deterministic generator only depends on range_end

  if (mode == PRE_DATA_COMPARE_FIX) {
    const size_t pattern_bytes = sizeof(test_pattern);
    const size_t buffer_bytes = (size_t)ifmap_size * sizeof(u16);
    const size_t copy_bytes =
        (pattern_bytes < buffer_bytes) ? pattern_bytes : buffer_bytes;

    // never copy more than the destination can hold
    memcpy(input_data, test_pattern, copy_bytes);

    // pattern shorter than the buffer: give the tail a defined value (bf16 0.0)
    if (copy_bytes < buffer_bytes) {
      memset(reinterpret_cast<unsigned char *>(input_data) + copy_bytes, 0x00,
             buffer_bytes - copy_bytes);
    }
  }
  else {
    // range_end == 2 would make the modulus zero (undefined behaviour);
    // fall back to 1 so the integer part is simply 0 in that case
    const int span = (range_end - 2) != 0 ? (range_end - 2) : 1;

    for (u64 i = 0; i < ifmap_size; i++) {
      // integer part cycles through 0 .. span-1 and alternates sign with i,
      // the fractional part sweeps in steps of 0.002 over table_hw entries
      const int magnitude = (int)i % span;
      const int sign = ((int)i % 2) ? 1 : -1;
      float input = magnitude * sign + 0.03 + (i % table_hw) * 0.002;
      input_data[i] = convert_fp32_bf16(input);
    }
  }

#ifdef DBG
  for (u64 i = 0; i < ifmap_size; i++) {
    printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(input_data[i]), input_data[i], floor(log2((convert_bf16_fp32(input_data[i])))));
  }
#endif /* ifdef DBG */

}

/**
 * Run one pass of the bf16 lookup-table test for the file-level \c mode.
 *
 * Steps, in order:
 *   1. pick the input shape (1 x channel x 4 x 8 for PRE_DATA_COMPARE_FIX,
 *      1 x channel x 16 x 16 otherwise) and allocate the local tensors
 *   2. generate host input with gen_input() and a host reference with
 *      tl_lut_ref()
 *   3. for DATA_COMPARE_U8 additionally allocate a FMT_U8 output tensor and
 *      read the result from it instead of the bf16 output
 *   4. upload the input, run bf16_emit(), read the result back and hand it
 *      to verify() together with the reference and an epsilon of 0.01
 *   5. release local tensors in reverse order of allocation, then the host
 *      buffers
 *
 * \param ctx runtime handle forwarded to the upload/emit/download helpers
 * \param bmk kernel context used for tensor allocation and command emission
 */
static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
{
  // TODO: check more shape / align
  const bool use_fixed_pattern = (mode == PRE_DATA_COMPARE_FIX);

  u32 input_n = 1;
  u32 input_c = channel;
  u32 input_h = use_fixed_pattern ? 4 : 16;
  u32 input_w = use_fixed_pattern ? 8 : 16;

  float epsilon = 0.01;
  int range_start = -8;
  int range_end = 8;

  // table shape is hard coded for the hardware: 32 x 8 per channel
  tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
  tl_shape_t ofmap_shape = ifmap_shape;
  tl_shape_t table_shape = {input_n, channel, table_h, table_w};

  u64 ifmap_size = tl_shape_size(&ifmap_shape);
  u64 ofmap_size = tl_shape_size(&ofmap_shape);

  fmt_t fmt = FMT_BF16;

  // local tensors; the release sequence at the end mirrors this order
  tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
  tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
  tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
  tl_t *tl_table_answer_y0 = alloc_tl(bmk, table_shape, fmt, /*align*/1);
  tl_t *tl_table_answer_slope = alloc_tl(bmk, table_shape, fmt, /*align*/1);
  tl_t *tl_ofmap_u8 = nullptr;

  // host-side buffers for the input and the reference result
  int elem_bytes = bytesize_of_fmt(fmt);
  u64 ifmap_bytesize = ifmap_size * elem_bytes;
  u64 ofmap_bytesize = ofmap_size * elem_bytes;

  u16 *input_data = (u16 *)xmalloc(ifmap_bytesize);
  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);

  gen_input(input_data, ifmap_size, mode, range_start, range_end);
  tl_lut_ref(ref_data, input_data, ifmap_shape);

  // tensor the result is read back from: bf16 by default, u8 in U8 mode
  tl_t *result_tl = tl_ofmap_bf16;
  if (mode == DATA_COMPARE_U8) {
    tl_ofmap_u8 = alloc_tl(bmk, ofmap_shape, FMT_U8, /*align*/1);
    result_tl = tl_ofmap_u8;
  }

  // <! FIXME: prepare it
  // upload the generated input into tl_ifmap
  bmk1880v2_tdma_tg2l_tensor_copy_param_t g2l_param;
  memset(&g2l_param, 0, sizeof(g2l_param));
  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, input_data, fmt, &g2l_param);
  launch_put_bf16_tensor_g2l(ctx, bmk, g2l_param.src, &g2l_param); // input

  // last two tensor arguments are outputs (bf16 result, optional u8 result)
  bf16_emit(ctx, bmk,
            tl_ifmap,
            tl_buf,
            tl_table_answer_y0,
            tl_table_answer_slope,
            OUT tl_ofmap_bf16,
            OUT tl_ofmap_u8,
            range_start, range_end);

  // download the result and check it against the host reference
  u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, result_tl, result_tl->fmt);
  verify(ofmap_data, ref_data, input_data, ifmap_size, epsilon);

  // release local tensors, last allocated first
  if (tl_ofmap_u8) {
    free_tl(bmk, tl_ofmap_u8);
  }
  free_tl(bmk, tl_table_answer_slope);
  free_tl(bmk, tl_table_answer_y0);
  free_tl(bmk, tl_ofmap_bf16);
  free_tl(bmk, tl_buf);
  free_tl(bmk, tl_ifmap);

  // release host buffers
  free(input_data);
  free(ref_data);
  free(ofmap_data);
}

/**
 * Test entry point.
 *
 * Stores the rounding mode returned by set_store_feround(), initialises the
 * test runtime, runs test_tl_int8_lut_bf16() once per selected TEST_MODE,
 * then shuts the runtime down and hands the stored rounding mode back to
 * restore_feround().
 *
 * The selected range is [PRE_DATA_COMPARE_FIX, DATA_COMPARE_ACCURACY), i.e.
 * only the fixed-pattern mode. Alternative ranges would be
 * [PRE_DATA_COMPARE_FIX, TEST_MODE_MAX) for every mode, or
 * [DATA_COMPARE_ACCURACY, DATA_COMPARE_U8) for the accuracy mode only.
 */
int main()
{
  int saved_round_mode = set_store_feround();

  CVI_RT_HANDLE ctx;
  bmk_ctx_t *bmk;
  test_init(&ctx, &bmk);

  const int first_mode = PRE_DATA_COMPARE_FIX;
  const int end_mode = DATA_COMPARE_ACCURACY; // exclusive

  int current = first_mode;
  while (current < end_mode) {
    // the test body reads the file-level `mode`
    mode = static_cast<TEST_MODE>(current);
    printf ("test mode %d...\n", mode);
    test_tl_int8_lut_bf16(&ctx, bmk);
    ++current;
  }

  test_exit(&ctx);
  restore_feround(saved_round_mode);

  return 0;
}
